import numpy as np
import pandas as pd

# Skill score functions. A, B, C, D are the counts of a 2x2 contingency table.
def HK_skill_score(A, B, C, D):
    """Return the Hanssen-Kuipers skill score of a 2x2 contingency table.

    As the table is filled further down in this file, A counts observed-rain
    cases forecast as rain, B observed-rain cases forecast as no rain,
    C observed-no-rain cases forecast as rain and D observed-no-rain cases
    forecast as no rain.

    The score is (A*D - C*B) / ((A + B) * (C + D)). With plain Python numbers
    a ZeroDivisionError is raised when either observed class is empty.
    """
    numerator = A * D - C * B
    denominator = (A + B) * (C + D)
    return numerator / denominator

def Accuracy(A,B,C,D):
    """Return the fraction of correct forecasts, (A + D) / (A + B + C + D).

    A and D are the two "forecast equals observation" cells of the 2x2
    contingency table, B and C the two mismatching cells.
    """
    correct = A + D
    total = A + B + C + D
    return correct / total

def Balanced_Accuracy(A,B,C,D):
    """Return the balanced accuracy of a 2x2 contingency table.

    Balanced accuracy is the mean of the per-class recalls:

    * A / (A + B): share of the observed-positive cases forecast correctly,
    * D / (C + D): share of the observed-negative cases forecast correctly.

    The previous expression averaged (A+D)/N and (B+C)/N, which always sum
    to 1, so it returned 0.5 for every input.

    With plain Python numbers a ZeroDivisionError is raised when either
    observed class is empty (A + B == 0 or C + D == 0).
    """
    positive_recall = A / (A + B)
    negative_recall = D / (C + D)
    Bal_Acc = (positive_recall + negative_recall) / 2
    return Bal_Acc
# Load the full data set: the indicator observations and the rain target.
# Only the 'sameday' column of the target file is kept.
indicator_data = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only.csv')
df_timetorain = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/time_to_rain.csv')
indicator_data.drop(columns=['Date'], inplace=True)
df_timetorain.drop(columns=['datetime'], inplace=True)
df_timetorain = df_timetorain['sameday']

# Load the data for 2020.
# NOTE(review): these assignments replace the frames loaded just above, so
# everything below starts from the 2020 files -- confirm that is intended.
indicator_data = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only_2020.csv')
df_timetorain = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/time_to_rain_2020.csv')['sameday']
# Besides the date, three indicator columns are removed from the 2020 set.
indicator_data.drop(columns=['Date', 'Duck', 'mosquito', 'Other'], inplace=True)

# NOTE(review): RandomForestClassifier, BernoulliNB, SVC, LeaveOneOut and
# SequentialFeatureSelector are not imported in the visible part of this file
# (only numpy and pandas are); they must be brought into scope before running.
# These three classifiers are not used by the selection loop below.
clf2 = RandomForestClassifier(random_state=42)
clf3 = BernoulliNB()
clf4 = SVC(probability=True, random_state=42)

# Run the sequential feature selection (SFS) backward or forward; comment out
# one of the two variants.
X = indicator_data
Y = df_timetorain
# Random seeds for the random forest; the whole selection is repeated per seed.
list_random = [2,3,6,42]
# Numbers of indicators the selection has to end up with.
feature_counts = [1,2,3,4,5,6,7,8,9,10]
# One row per SFS run is appended below. The 7 initial rows stay all-NaN and
# are removed by the dropna() that precedes saving.
# assumes X has exactly 11 indicator columns -- TODO confirm
result_sfs_backward = pd.DataFrame(index=np.arange(7),columns=np.arange(11))
# result_sfs_forward = pd.DataFrame(index=np.arange(7),columns=np.arange(8))

# The splitter takes no arguments, so a single instance serves every run.
loocv = LeaveOneOut()
for q in list_random:
    for i in feature_counts:
        RF = RandomForestClassifier(max_features = 'sqrt',random_state = q, n_estimators=100 )
        sfs = SequentialFeatureSelector(RF,direction = 'backward',cv=loocv,n_features_to_select = i,scoring='balanced_accuracy')
        sfs.fit(X, Y)
        # Boolean mask over the columns of X: True where the indicator is kept.
        # The label only has to be unique per (i, q); the index is reset
        # before the results are saved.
        result_sfs_backward.loc[f'{i}_{q}', :] = sfs.get_support()
        print(f'SFS backward finished: n_features_to_select={i}, random_state={q}')

# Save the results of the forward or backward SFS.

# result_sfs_forward.dropna(inplace=True)
# result_sfs_forward.reset_index(drop=True, inplace=True)
# result_sfs_forward.columns = indicator_data.columns
# result_sfs_forward_int = result_sfs_forward.astype(int)
# result_sfs_forward_int.to_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_selection_foreward.csv',index=False)

# Drop the all-NaN placeholder rows and renumber the remaining runs 0..n-1.
result_sfs_backward.dropna(inplace=True)
result_sfs_backward.reset_index(drop=True, inplace=True)
# Label the mask columns with the indicator names. Plain attribute assignment
# is used because the `inplace` keyword of DataFrame.set_axis is not available
# in pandas 2.x.
result_sfs_backward.columns = indicator_data.columns
# Store the selection masks as 0/1 instead of False/True.
result_sfs_backward_int = result_sfs_backward.astype(int)
result_sfs_backward_int.to_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_selection_backward_2022.csv',index=False)

# Test the newly found indicators and see if they perform better (the entire
# training is done here again).
#
# The classifiers and the `models` list are created inside the evaluation loop
# that follows, once per random seed `r`. They used to be created here as
# well, but `r` is not bound yet at this point, so those statements raised a
# NameError and their results were overwritten before being used anyway.

# Skill scores: rows 'Accuracy' / 'HK_score', one column per evaluated run.
ML_results_ind_back = pd.DataFrame(index = np.arange(0))
# Leave-one-out class probabilities: one row per sample, two columns per run.
# NOTE(review): the row count is taken from the indicator_data loaded at this
# point -- confirm it matches the data the evaluation loop predicts on.
ML_results_probability_back = pd.DataFrame(index = np.arange(len(indicator_data)),columns=np.arange(0))


# `warnings` is not imported at the top of this file, so import it here.
import warnings

# Silence this specific scikit-learn UserWarning for all runs below.
warnings.filterwarnings("ignore", category=UserWarning, message="y_pred contains classes not in y_true")

# Read the indicator table once; every SFS run below works on a reduced copy.
# NOTE(review): this is the full-period file, while the selection above and
# the target df_timetorain come from the 2020 files -- confirm the rows match.
full_indicator_data = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only.csv')
del full_indicator_data['Date']

for j in result_sfs_backward.index:
    # Indicators that SFS run j did not select.
    selection_mask = result_sfs_backward.loc[j]
    rejected = [column for column, selected in selection_mask.items() if not selected]
    # Columns of the file that are not part of the mask are left untouched.
    indicator_data = full_indicator_data.drop(columns=full_indicator_data.columns.intersection(rejected))

    for r in list_random:
        # Only needed when the (commented-out) voting classifier is enabled.
        clf2 = RandomForestClassifier(random_state=r, n_estimators=100)
        clf3 = BernoulliNB()
        clf4 = SVC(probability=True, random_state=r)

        models = []
        # models.append(('BNB', BernoulliNB()))
        # NOTE(review): random_state is fixed to 2 here, so `r` only changes
        # the column label for this model, not the forest -- confirm intended.
        models.append(('RF', RandomForestClassifier(max_features=1, random_state=2, n_estimators=100)))
        # models.append(('SVM', SVC(gamma='auto',probability=True,random_state= r)))
        # models.append(('VC', VotingClassifier(estimators=[('RF', clf2), ('BNB', clf3), ('SVC', clf4)], voting='soft',weights=[2, 1, 1])))
        # models.append(('NN', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=r, max_iter=5000)))

        for name, model in models:
            X = indicator_data
            Y = df_timetorain
            loocv = LeaveOneOut()
            # Column 0 is read as P(no rain), column 1 as P(rain).
            cv_predict = cross_val_predict(model, X, Y, cv=loocv, method='predict_proba')

            # Forecast rain only when it is strictly more likely than no rain;
            # a tie counts as "no rain".
            observed = pd.DataFrame(df_timetorain)['sameday'].to_numpy()
            forecast_rain = cv_predict[:, 1] > cv_predict[:, 0]

            # 2x2 contingency table, converted to plain ints so the score
            # functions behave exactly as with hand-counted values.
            A = int(np.sum((observed == 1) & forecast_rain))    # rain, forecast rain
            B = int(np.sum((observed == 1) & ~forecast_rain))   # rain, forecast no rain
            C = int(np.sum((observed == 0) & forecast_rain))    # no rain, forecast rain
            D = int(np.sum((observed == 0) & ~forecast_rain))   # no rain, forecast no rain

            run_label = name + str(r) + '_'+ str(j)
            hk_score = HK_skill_score(A, B, C, D)
            ML_results_ind_back.loc['Accuracy', run_label] = Accuracy(A, B, C, D)
            ML_results_ind_back.loc['HK_score', run_label] = hk_score
            ML_results_probability_back.loc[:, 'probability_rain_' + run_label] = cv_predict[:, 1]
            ML_results_probability_back.loc[:, 'probability_no_rain_' + run_label] = cv_predict[:, 0]
            print(run_label)
            print(hk_score)


# Combine the per-run scores with reference columns and write them to disk.
# NOTE(review): ML_results_ind_kfold is not defined anywhere in this file;
# presumably it is produced by another script or interactive session -- confirm.
# The columns added under run number 40 are copied from that frame.
ML_results_ind_backward = ML_results_ind_back.copy()
ML_results_ind_backward['VC42_40'] =  ML_results_ind_kfold.loc[:,'VC42']
ML_results_ind_backward['VC2_40'] =  ML_results_ind_kfold.loc[:,'VC2']
ML_results_ind_backward['VC3_40'] =  ML_results_ind_kfold.loc[:,'VC3']
ML_results_ind_backward['VC6_40'] =  ML_results_ind_kfold.loc[:,'VC6']

# index=False: the row labels are not written to the CSV.
ML_results_ind_backward.to_csv('C:/Users/joepb/PycharmProjects/data_storage/ML_RF_RF_backward_2022.csv',index=False)

# Same construction with the RF reference columns.
ML_results_ind_backward_RF = ML_results_ind_back.copy()
ML_results_ind_backward_RF['RF42_40'] =  ML_results_ind_kfold.loc[:,'RF42']
ML_results_ind_backward_RF['RF2_40'] =  ML_results_ind_kfold.loc[:,'RF2']
ML_results_ind_backward_RF['RF3_40'] =  ML_results_ind_kfold.loc[:,'RF3']
ML_results_ind_backward_RF['RF6_40'] =  ML_results_ind_kfold.loc[:,'RF6']

# NOTE(review): this is the same path as the write above, so the file written
# there is overwritten by this one -- confirm whether two files were intended.
ML_results_ind_backward_RF.to_csv('C:/Users/joepb/PycharmProjects/data_storage/ML_RF_RF_backward_2022.csv',index=False)



# ML_results_ind_forward = ML_results_ind_back.copy()
# ML_results_ind_forward['VC'] = ML_results_ind['VC']
# ML_results_ind_forward = ML_results_ind_forward.rename(columns={'VC0':1, 'VC1':2, 'VC2':3, 'VC3':4, 'VC4':5, 'VC5':6, 'VC6':7, 'VC':8})
# ML_results_ind_forward.to_csv('C:/Users/joepb/PycharmProjects/data_storage/ML_RF_forward.csv',index=False)


# Load the backward and forward results from disk. This replaces the
# in-memory ML_results_ind_backward.
# NOTE(review): no index_col is passed, so both frames get a default integer
# index; later `.loc['HK_score', ...]` lookups on them need a row labelled
# 'HK_score' -- confirm the CSV layout provides it.
ML_results_ind_backward = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/ML_RF_RF_backward_2022.csv')
ML_results_ind_forward = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/ML_RF_RF_forward.csv')

def _summarise_hk_runs(results, prefix, seeds, size_offsets, run_starts, full_run=40):
    """Return (std, mean) frames of the HK score, grouped by subset size.

    Columns of ``results`` are expected to be named
    ``<prefix><seed>_<run number>`` and ``results`` must have a row labelled
    'HK_score'. For every ``offset`` in ``size_offsets`` the columns of the
    runs ``offset + start`` (for each ``start`` in ``run_starts``) and every
    seed in ``seeds`` form one group; the group is stored under the label
    ``<prefix><offset + run_starts[-1]>``. A final group holds the columns of
    run ``full_run`` and is stored under ``<prefix><full_run>``.

    Column names that are absent from ``results`` are skipped silently, so a
    group without any matching column yields NaN.

    The first returned frame has the single row 'Standard deviation', the
    second the single row 'Mean'.
    """
    std_frame = pd.DataFrame()
    mean_frame = pd.DataFrame()

    groups = []
    for offset in size_offsets:
        runs = [offset + start for start in run_starts]
        wanted = [prefix + str(seed) + '_' + str(run) for run in runs for seed in seeds]
        groups.append((prefix + str(runs[-1]), wanted))
    groups.append((prefix + str(full_run),
                   [prefix + str(seed) + '_' + str(full_run) for seed in seeds]))

    for label, wanted in groups:
        hk_values = results.loc['HK_score', results.columns.isin(wanted)].values
        std_frame.loc['Standard deviation', label] = np.std(hk_values)
        mean_frame.loc['Mean', label] = np.mean(hk_values)
    return std_frame, mean_frame


# Because the random seed was varied there are several predictions per number
# of indicators: the SFS was repeated once per seed (the runs starting at row
# 0, 10, 20 and 30) and every selection was evaluated once per seed. Mean and
# standard deviation of the HK score are taken over all of them.
#
# This is done first for the RF algorithm and then for the VC algorithm
# (see thesis).
list_random_back = [0,1,2,3,4,5,6,7,8,9]
list_back_runs = [0,10,20,30]

#RF
std_dev_rndm_test_HK_back, HK_mean_back = _summarise_hk_runs(
    ML_results_ind_backward_RF, 'RF', list_random, list_random_back, list_back_runs)

std_dev_rndm_test_HK_back.to_csv('C:/Users/joepb/PycharmProjects/data_storage/sfs_back_2022_RF_std.csv')
HK_mean_back.to_csv('C:/Users/joepb/PycharmProjects/data_storage/sfs_back_2022_RF_HK.csv')
std_dev_rndm_test_HK_back_RF = std_dev_rndm_test_HK_back.copy()
HK_mean_back_RF = HK_mean_back.copy()

#VC
# From here on std_dev_rndm_test_HK_back / HK_mean_back hold the VC summary;
# the RF summary stays available in the *_RF copies made above.
std_dev_rndm_test_HK_back, HK_mean_back = _summarise_hk_runs(
    ML_results_ind_backward, 'VC', list_random, list_random_back, list_back_runs)

std_dev_rndm_test_HK_back.to_csv('C:/Users/joepb/PycharmProjects/data_storage/sfs_back_2022_VC_std.csv')
HK_mean_back.to_csv('C:/Users/joepb/PycharmProjects/data_storage/sfs_back_2022_VC_HK.csv')
std_dev_rndm_test_HK_back_VC = std_dev_rndm_test_HK_back.copy()
HK_mean_back_VC = HK_mean_back.copy()


# Split the selection masks into four blocks of ten consecutive rows and
# renumber the rows of each block 0..9.
run1 = result_sfs_backward_int.iloc[0:10,:].reset_index(drop=True)
run2 = result_sfs_backward_int.iloc[10:20,:].reset_index(drop=True)
run3 = result_sfs_backward_int.iloc[20:30,:].reset_index(drop=True)
run4 = result_sfs_backward_int.iloc[30:40,:].reset_index(drop=True)

# Rows 0-3: one row per block; row 4: the column means (filled in below).
score = pd.DataFrame(columns= run1.columns,index = np.arange(5))

# For every block and every indicator, store the row position at which the
# indicator switches from "not selected" (0) to "selected" (1). The row before
# the first one counts as 0, so an indicator selected in row 0 scores 0. When
# an indicator switches on more than once, the last switch is the one kept.
runlist = [run1,run2,run3,run4]
for block_number, run in enumerate(runlist):
    for indicator in run.columns:
        previous = 0
        for position in run.index:
            current = run.loc[position, indicator]
            if current == 1 and previous == 0:
                score.loc[block_number, indicator] = position
            previous = current

# Indicators that are never selected within a block get the value 10.
# np.nan is the canonical spelling of NaN (the alternate aliases were cleaned
# up in NumPy 2.0).
score.replace(np.nan,10,inplace=True)

# Clear row 4 again so that it does not take part in the mean, then fill it
# with the per-indicator mean of rows 0-3 (NaN entries are skipped).
score.loc[4,:] = np.nan
score.loc[4,:] = np.mean(score,axis = 0)

score.to_csv('C:/Users/joepb/PycharmProjects/data_storage/sfs_back_2022_RF_score.csv')



# Two-panel figure: (a) backward selection, (b) forward selection.
# NOTE(review): `plt` is not imported in the visible part of this file.
fig, (ax1, ax2) = plt.subplots(1, 2,sharex = True)

# HK_mean_back only ever receives a row labelled 'Mean' in this file, so that
# is the row to plot; looking up 'HK_score' here raised a KeyError.
X1 = HK_mean_back.columns
Y1 = HK_mean_back.loc['Mean',:]

# NOTE(review): ML_results_ind_forward is read with pd.read_csv without
# index_col; confirm that it actually has a row labelled 'HK_score'.
X2 = ML_results_ind_forward.columns
Y2 = ML_results_ind_forward.loc['HK_score',:]

ax1.plot(X1, Y1, 'bx',linestyle='-')
ax2.plot(X2, Y2,'bx',linestyle='-')

ax1.set_xlabel('number of indicators')
ax2.set_xlabel('number of indicators')
ax1.set_ylabel("HK skill score")

ax1.set_ylim(0.2,0.5)
ax2.set_ylim(0.2,0.5)

# Panel letters.
ax1.text(1.5,0.48,s = 'a', fontsize='x-large', verticalalignment='top', fontfamily='serif')

ax2.text(1.5,0.48,s = 'b', fontsize='x-large', verticalalignment='top', fontfamily='serif')

# bbox=dict(facecolor='0.7', edgecolor='none', pad=3.0)
# plt.title("Number of Students in each group")
# plt.legend()
plt.savefig('C:/Users/joepb/OneDrive/Documenten/Wageningen - Uni/Master Thesis/Draft thesis figures and docs/backward_foreward_selection_balanced_accuracy.png',bbox_inches= 'tight')
plt.show()


# Figure for the RF results: mean HK score per number of indicators, with the
# standard deviation over seeds and SFS repetitions as error bars.
#
# The RF summary is taken from the *_RF copies: by this point HK_mean_back and
# std_dev_rndm_test_HK_back have been reassigned to the VC summary, so the
# 'RF..' rename below would not match any column and the figure saved under
# the RF file name would show the VC values.
# Column 'RF30' groups the runs with one selected indicator, 'RF39' those with
# ten, and 'RF40' the reference run; relabel them with the indicator count.
HK_mean_back = HK_mean_back_RF.rename(columns = {'RF30':1,'RF31':2,'RF32':3,'RF33':4,'RF34':5,'RF35':6,'RF36':7,'RF37':8,'RF38':9,'RF39':10,'RF40':11})


# NOTE(review): `plt` is not imported in the visible part of this file.
fig = plt.figure()
ax1 = fig.add_subplot(111)

X1 = HK_mean_back.columns
Y1 = HK_mean_back.loc['Mean',:]


ax1.plot(X1, Y1, 'bx',linestyle='-', label='Indicator forecast')

# Error bars only (fmt="none" draws no markers or connecting line).
y_error = std_dev_rndm_test_HK_back_RF.loc['Standard deviation',:].values
error = ax1.errorbar(X1, Y1, color='r',yerr=y_error,fmt="none", label='Standard deviation')


ax1.set_xlabel('number of indicators')
ax1.set_ylabel("HK skill score")
ax1.set_xticks(X1)

ax1.set_ylim(0.0,0.5)

# ax1.text(1.5,0.48,s = 'a', fontsize='x-large', verticalalignment='top', fontfamily='serif')

handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels)

plt.savefig('C:/Users/joepb/OneDrive/Documenten/Wageningen - Uni/Master Thesis/Draft thesis figures and docs/backward_selection_balanced_accuracy_2022_RF.png',bbox_inches= 'tight')
plt.show()

# bbox_to_anchor=(0.48, 0.4),
# ax2.text(1.5,0.48,s = 'b', fontsize='x-large', verticalalignment='top', fontfamily='serif')

# bbox=dict(facecolor='0.7', edgecolor='none', pad=3.0)
# plt.title("Number of Students in each group")
# plt.legend()
